# Load of libraries
library(readr) # Read cvs
library(dplyr) # Handle data and use of pipes
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) # Handle dataframes
library(ggplot2) # Useful library to visualizations
library(lubridate) # Handle with dates
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(randomForest) # Random Forest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(Metrics) # Use of RMSE metric
library(ggcorrplot) # Correlation plot
library(caret) # ML library
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(scales) # To rescale output values
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(xts)
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tictoc)
# NOTE(review): `rm(list = ls())` does not give a clean session (attached
# packages and options persist) and silently wipes the workspace of anyone
# who sources this script; prefer restarting R instead.
rm(list = ls(all.names = TRUE))
# NOTE(review): a hard-coded, machine-specific setwd() breaks portability;
# prefer project-relative paths (RStudio project or the `here` package).
setwd("/Users/pablosalarcarrera/Desktop/Master Classes/DS636/")
# Load datasets. Spell out header = TRUE: `T` is an ordinary variable that
# can be reassigned, so relying on it is unsafe.
sales_train <- read.csv("sales_train.csv", header = TRUE)
items <- read.csv("items.csv", header = TRUE)
items_categories <- read.csv("item_categories.csv", header = TRUE)
shops <- read.csv("shops.csv", header = TRUE)
# Print first 5 rows of each dataset
head(sales_train)
## date date_block_num shop_id item_id item_price item_cnt_day
## 1 02.01.2013 0 59 22154 999.00 1
## 2 03.01.2013 0 25 2552 899.00 1
## 3 05.01.2013 0 25 2552 899.00 -1
## 4 06.01.2013 0 25 2554 1709.05 1
## 5 15.01.2013 0 25 2555 1099.00 1
## 6 10.01.2013 0 25 2564 349.00 1
head(items)
## item_name item_id
## 1 ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D 0
## 2 !ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия] 1
## 3 ***В ЛУЧАХ СЛАВЫ (UNV) D 2
## 4 ***ГОЛУБАЯ ВОЛНА (Univ) D 3
## 5 ***КОРОБКА (СТЕКЛО) D 4
## 6 ***НОВЫЕ АМЕРИКАНСКИЕ ГРАФФИТИ (UNI) D 5
## item_category_id
## 1 40
## 2 76
## 3 40
## 4 40
## 5 40
## 6 40
head(items_categories)
## item_category_name item_category_id
## 1 PC - Гарнитуры/Наушники 0
## 2 Аксессуары - PS2 1
## 3 Аксессуары - PS3 2
## 4 Аксессуары - PS4 3
## 5 Аксессуары - PSP 4
## 6 Аксессуары - PSVita 5
head(shops)
## shop_name shop_id
## 1 !Якутск Орджоникидзе, 56 фран 0
## 2 !Якутск ТЦ "Центральный" фран 1
## 3 Адыгея ТЦ "Мега" 2
## 4 Балашиха ТРК "Октябрь-Киномир" 3
## 5 Волжский ТЦ "Волга Молл" 4
## 6 Вологда ТРЦ "Мармелад" 5
# Merge datasets into a unique dataset: enrich each transaction with its
# item, category and shop attributes via left joins on the ID columns.
sales <- sales_train %>%
  left_join(items, by = "item_id") %>%
  left_join(items_categories, by = "item_category_id") %>%
  left_join(shops, by = "shop_id")
# Fix the format of date column: parse "dd.mm.YYYY" strings into Date.
sales$date <- as.Date(sales$date, "%d.%m.%Y")
# We may want to know sales in terms of day, month, weekday and year, so
# create four new columns. Use `<-` for assignment (tidyverse style; `=`
# is reserved for named arguments).
sales$day <- day(sales$date)
sales$month <- month(sales$date)
sales$weekday <- wday(sales$date)
sales$year <- year(sales$date)
# Transform IDs into categorical variables
sales$item_id <- factor(sales$item_id)
sales$item_category_id <- factor(sales$item_category_id)
sales$shop_id <- factor(sales$shop_id)
# Print first rows of the new dataset
head(sales)
## date date_block_num shop_id item_id item_price item_cnt_day
## 1 2013-01-02 0 59 22154 999.00 1
## 2 2013-01-03 0 25 2552 899.00 1
## 3 2013-01-05 0 25 2552 899.00 -1
## 4 2013-01-06 0 25 2554 1709.05 1
## 5 2013-01-15 0 25 2555 1099.00 1
## 6 2013-01-10 0 25 2564 349.00 1
## item_name item_category_id
## 1 ЯВЛЕНИЕ 2012 (BD) 37
## 2 DEEP PURPLE The House Of Blue Light LP 58
## 3 DEEP PURPLE The House Of Blue Light LP 58
## 4 DEEP PURPLE Who Do You Think We Are LP 58
## 5 DEEP PURPLE 30 Very Best Of 2CD (Фирм.) 56
## 6 DEEP PURPLE Perihelion: Live In Concert DVD (Кир.) 59
## item_category_name shop_name day month weekday
## 1 Кино - Blu-Ray Ярославль ТЦ "Альтаир" 2 1 4
## 2 Музыка - Винил Москва ТРК "Атриум" 3 1 5
## 3 Музыка - Винил Москва ТРК "Атриум" 5 1 7
## 4 Музыка - Винил Москва ТРК "Атриум" 6 1 1
## 5 Музыка - CD фирменного производства Москва ТРК "Атриум" 15 1 3
## 6 Музыка - Музыкальное видео Москва ТРК "Атриум" 10 1 5
## year
## 1 2013
## 2 2013
## 3 2013
## 4 2013
## 5 2013
## 6 2013
# Data-quality checks: missing values and exact duplicate rows.
sum(is.na(sales))
## [1] 0
sum(duplicated(sales))
## [1] 6
which(duplicated(sales))
## [1] 76963 1435368 1496767 1671874 1866341 2198567
# Drop duplicated rows, keeping the first occurrence of each.
dup_rows <- duplicated(sales)
sales <- sales[!dup_rows, ]
# Check lowest value in item price
min(sales$item_price)
## [1] -1
# A negative price is a data error; keep only non-negative prices.
valid_price <- sales$item_price >= 0
sales <- sales[valid_price, ]
# Distribution of item_price before outlier handling.
boxplot(sales$item_price, ylab = "Item price")

# Deal with outliers: first remove the extreme prices above 100k,
# then trim the remaining long tail above 40k.
sales <- sales[sales$item_price < 100000, ]
boxplot(sales$item_price, ylab = "Item price")

sales <- sales[sales$item_price < 40000, ]
boxplot(sales$item_price, ylab = "Item price")

# Structure of the cleaned dataset.
glimpse(sales)
## Rows: 2,935,828
## Columns: 14
## $ date <date> 2013-01-02, 2013-01-03, 2013-01-05, 2013-01-06, 20…
## $ date_block_num <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ shop_id <fct> 59, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,…
## $ item_id <fct> 22154, 2552, 2552, 2554, 2555, 2564, 2565, 2572, 25…
## $ item_price <dbl> 999.00, 899.00, 899.00, 1709.05, 1099.00, 349.00, 5…
## $ item_cnt_day <dbl> 1, 1, -1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 2, 1, 2, 1,…
## $ item_name <chr> "ЯВЛЕНИЕ 2012 (BD)", "DEEP PURPLE The House Of Blu…
## $ item_category_id <fct> 37, 58, 58, 58, 56, 59, 56, 55, 55, 55, 55, 55, 55,…
## $ item_category_name <chr> "Кино - Blu-Ray", "Музыка - Винил", "Музыка - Винил…
## $ shop_name <chr> "Ярославль ТЦ \"Альтаир\"", "Москва ТРК \"Атриум\""…
## $ day <int> 2, 3, 5, 6, 15, 10, 2, 4, 11, 3, 3, 5, 7, 8, 10, 11…
## $ month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ weekday <dbl> 4, 5, 7, 1, 3, 5, 4, 6, 6, 5, 5, 7, 2, 3, 5, 6, 1, …
## $ year <dbl> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 201…
# Remove the columns with the names
# (the free-text name columns duplicate the ID columns already present and
# are not needed for modelling)
sales <- sales %>% select(-item_name, -item_category_name, -shop_name)
glimpse(sales)
## Rows: 2,935,828
## Columns: 11
## $ date <date> 2013-01-02, 2013-01-03, 2013-01-05, 2013-01-06, 2013…
## $ date_block_num <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ shop_id <fct> 59, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2…
## $ item_id <fct> 22154, 2552, 2552, 2554, 2555, 2564, 2565, 2572, 2572…
## $ item_price <dbl> 999.00, 899.00, 899.00, 1709.05, 1099.00, 349.00, 549…
## $ item_cnt_day <dbl> 1, 1, -1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 2, 1, 2, 1, 1…
## $ item_category_id <fct> 37, 58, 58, 58, 56, 59, 56, 55, 55, 55, 55, 55, 55, 5…
## $ day <int> 2, 3, 5, 6, 15, 10, 2, 4, 11, 3, 3, 5, 7, 8, 10, 11, …
## $ month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ weekday <dbl> 4, 5, 7, 1, 3, 5, 4, 6, 6, 5, 5, 7, 2, 3, 5, 6, 1, 4,…
## $ year <dbl> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
# Per-column summary statistics (n, missing, distinct, quantiles).
describe(sales)
## sales
##
## 11 Variables 2935828 Observations
## --------------------------------------------------------------------------------
## date
## n missing distinct Info Mean Gmd .05
## 2935828 0 1034 1 2014-04-03 330.2 2013-02-09
## .10 .25 .50 .75 .90 .95
## 2013-03-17 2013-08-01 2014-03-04 2014-12-05 2015-05-20 2015-08-09
##
## lowest : 2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05
## highest: 2015-10-27 2015-10-28 2015-10-29 2015-10-30 2015-10-31
## --------------------------------------------------------------------------------
## date_block_num
## n missing distinct Info Mean Gmd .05 .10
## 2935828 0 34 0.999 14.57 10.84 1 2
## .25 .50 .75 .90 .95
## 7 14 23 28 31
##
## lowest : 0 1 2 3 4, highest: 29 30 31 32 33
## --------------------------------------------------------------------------------
## shop_id
## n missing distinct
## 2935828 0 60
##
## lowest : 0 1 2 3 4 , highest: 55 56 57 58 59
## --------------------------------------------------------------------------------
## item_id
## n missing distinct
## 2935828 0 21802
##
## lowest : 0 1 2 3 4 , highest: 22165 22166 22167 22168 22169
## --------------------------------------------------------------------------------
## item_price
## n missing distinct Info Mean Gmd .05 .10
## 2935828 0 19983 0.998 890.6 1025 99 149
## .25 .50 .75 .90 .95
## 249 399 999 1999 2690
##
## lowest : 0.0700 0.0875 0.0900 0.1000 0.2000
## highest: 35490.0000 35990.0000 35991.0000 36990.0000 37991.0000
## --------------------------------------------------------------------------------
## item_cnt_day
## n missing distinct Info Mean Gmd .05 .10
## 2935828 0 198 0.281 1.243 0.4806 1 1
## .25 .50 .75 .90 .95
## 1 1 1 2 2
##
## lowest : -22 -16 -9 -6 -5, highest: 624 637 669 1000 2169
## --------------------------------------------------------------------------------
## item_category_id
## n missing distinct
## 2935828 0 84
##
## lowest : 0 1 2 3 4 , highest: 79 80 81 82 83
## --------------------------------------------------------------------------------
## day
## n missing distinct Info Mean Gmd .05 .10
## 2935828 0 31 0.999 15.85 10.3 2 3
## .25 .50 .75 .90 .95
## 8 16 24 28 30
##
## lowest : 1 2 3 4 5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## month
## n missing distinct Info Mean Gmd .05 .10
## 2935828 0 12 0.993 6.248 4.065 1 1
## .25 .50 .75 .90 .95
## 3 6 9 11 12
##
## lowest : 1 2 3 4 5, highest: 8 9 10 11 12
##
## Value 1 2 3 4 5 6 7 8 9
## Frequency 303559 270250 284055 228289 224834 237428 234856 248415 219881
## Proportion 0.103 0.092 0.097 0.078 0.077 0.081 0.080 0.085 0.075
##
## Value 10 11 12
## Frequency 227068 183163 274030
## Proportion 0.077 0.062 0.093
## --------------------------------------------------------------------------------
## weekday
## n missing distinct Info Mean Gmd
## 2935828 0 7 0.977 4.166 2.45
##
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##
## Value 1 2 3 4 5 6 7
## Frequency 503102 337074 345767 352960 367272 439296 590357
## Proportion 0.171 0.115 0.118 0.120 0.125 0.150 0.201
## --------------------------------------------------------------------------------
## year
## n missing distinct Info Mean Gmd
## 2935828 0 3 0.864 2014 0.8209
##
## Value 2013 2014 2015
## Frequency 1267557 1055854 612417
## Proportion 0.432 0.360 0.209
## --------------------------------------------------------------------------------
# Check how many different items
sales %>% select(item_id) %>% distinct() %>% count()
## n
## 1 21802
# We can extract the numbers of shops, products and categories
n_shops <- length(unique(sales$shop_id))
print(paste0("numbers of shops: ", n_shops))
## [1] "numbers of shops: 60"
n_products <- length(unique(sales$item_id))
print(paste0("numbers of products: ", n_products))
## [1] "numbers of products: 21802"
n_categories <- length(unique(sales$item_category_id))
print(paste0("numbers of categories: ", n_categories))
## [1] "numbers of categories: 84"
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(reactable)
library(htmlwidgets)
library('IRdisplay')
library(htmltools)
# Total units sold per year.
product_sold_Year <- sales %>%
  group_by(year) %>%
  summarise(Product_sold_year = sum(item_cnt_day))
product_sold_Year
## # A tibble: 3 × 2
## year Product_sold_year
## <dbl> <dbl>
## 1 2013 1562728
## 2 2014 1320882
## 3 2015 764575
v1 <- plot_ly(
  product_sold_Year,
  x = ~year, y = ~Product_sold_year, color = ~year,
  colors = c("#4C3A51", "#774360", "#B25068"), type = 'bar'
) %>%
  layout(title = 'Total Product Sold Per Year', yaxis = list(title = 'Total Item Sold'))
v1
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
# Total revenue (price * quantity) per year.
sales_price_Year <- sales %>%
  group_by(year) %>%
  summarise(Sales_value_year = sum(item_price * item_cnt_day))
sales_price_Year
## # A tibble: 3 × 2
## year Sales_value_year
## <dbl> <dbl>
## 1 2013 1217115406.
## 2 2014 1346682085.
## 3 2015 834234429.
v4 <- plot_ly(
  sales_price_Year,
  x = ~year, y = ~Sales_value_year, color = ~year,
  colors = c("#764AF1", "#9772FB", "#F2F2F2"), type = 'bar'
) %>%
  layout(title = 'Total Sales Value Per Year', yaxis = list(title = 'Total Sales Value'))
v4
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
# Aggregate revenue and units sold by (month, year); 34 rows covering
# Jan 2013 - Oct 2015.
data_aggr1 <- aggregate(cbind(item_price, item_cnt_day) ~ month + year, sales, FUN = sum)
head(data_aggr1)
## month year item_price item_cnt_day
## 1 1 2013 82211725 131478
## 2 2 2013 75580187 128090
## 3 3 2013 84298312 147142
## 4 4 2013 61512823 107190
## 5 5 2013 57274133 106969
## 6 6 2013 63343615 125381
# Convert the revenue per month to millions
data_aggr1$item_price <- round(data_aggr1$item_price/1000000)
# Convert the items sold per month to thousands
data_aggr1$item_cnt_day <- round(data_aggr1$item_cnt_day/1000)
colnames(data_aggr1) <- c('month','year','Revenue (in millions $)', 'Number of items sold (thousands)')
head(data_aggr1)
## month year Revenue (in millions $) Number of items sold (thousands)
## 1 1 2013 82 131
## 2 2 2013 76 128
## 3 3 2013 84 147
## 4 4 2013 62 107
## 5 5 2013 57 107
## 6 6 2013 63 125
tail(data_aggr1)
## month year Revenue (in millions $) Number of items sold (thousands)
## 29 5 2015 59 72
## 30 6 2015 56 64
## 31 7 2015 54 63
## 32 8 2015 54 66
## 33 9 2015 59 73
## 34 10 2015 65 71
# Monthly revenue as a ts object. Note the end date matches the 34 data
# points exactly here, so no recycling occurs.
forecasting_revenue <- data_aggr1 %>% select(`Revenue (in millions $)`)
df1 <- ts(forecasting_revenue,frequency=12,start=c(2013,1),end=c(2015,10))
head(df1)
## Revenue (in millions $)
## [1,] 82
## [2,] 76
## [3,] 84
## [4,] 62
## [5,] 57
## [6,] 63
library(ggfortify)
library(forecast)
## Registered S3 methods overwritten by 'forecast':
## method from
## autoplot.Arima ggfortify
## autoplot.acf ggfortify
## autoplot.ar ggfortify
## autoplot.bats ggfortify
## autoplot.decomposed.ts ggfortify
## autoplot.ets ggfortify
## autoplot.forecast ggfortify
## autoplot.stl ggfortify
## autoplot.ts ggfortify
## fitted.ar ggfortify
## fortify.ts ggfortify
## residuals.ar ggfortify
##
## Attaching package: 'forecast'
## The following object is masked from 'package:Metrics':
##
## accuracy
# Trend and seasonal visualisations for the revenue series.
revenue_yearly_plot <- autoplot(df1) +
  geom_smooth(method='loess') +
  ggtitle('Revenue of Sales of Products Yearly') +
  xlab('Year') +
  ylab('$ Millions')
revenue_seasonal_plot <- ggseasonplot(df1, year.labels=TRUE, year.labels.left=TRUE) +
  ylab("$ million") +
  ggtitle("Seasonal plot: Product Sales Revenue")
revenue_seasonal_plot2 <- ggseasonplot(df1, polar=TRUE) +
  ylab("$ million") +
  ggtitle("Seasonal plot: Sale of Products Revenue")
# Monthly items-sold series (thousands), Jan 2013 - Oct 2015; the end date
# matches the 34 observations, so ts() does not recycle.
forecasting_items_sold <- data_aggr1 %>% select(`Number of items sold (thousands)`)
df2 <- ts(forecasting_items_sold, frequency=12, start=c(2013,1), end=c(2015,10))
head(df2)
## Jan Feb Mar Apr May Jun
## 2013 131 128 147 107 107 125
# Matching plots for the items-sold series.
items_soldyearly_plot <- autoplot(df2) +
  geom_smooth(method='loess') +
  ggtitle('Sales of Products Yearly') +
  xlab('Year') +
  ylab('Items sold (Thousand)')
items_soldseasonal_plot <- ggseasonplot(df2, year.labels=TRUE, year.labels.left=TRUE) +
  ylab("Items sold (Thousand)") +
  ggtitle("Seasonal plot: Product Sales")
items_soldseasonal_plot2 <- ggseasonplot(df2, polar=TRUE) +
  ylab("Items sold (Thousand)") +
  ggtitle("Seasonal plot: Sale of Products")
# Wider default figure size for the stacked grids below.
options(repr.plot.width = 14, repr.plot.height = 8)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
# Trend plots: revenue (top) and units sold (bottom).
grid.arrange(revenue_yearly_plot, items_soldyearly_plot, nrow=2, ncol=1)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Seasonal plots, cartesian then polar coordinates.
grid.arrange(revenue_seasonal_plot, items_soldseasonal_plot, nrow=2, ncol=1)

grid.arrange(revenue_seasonal_plot2, items_soldseasonal_plot2, nrow=2, ncol=1)

# Check how many years we have in the data
sales %>% select(year) %>% distinct() %>% count()
## n
## 1 3
# Total revenue per year.
year_sales <- sales %>%
  group_by(year) %>%
  summarise(sales_per_year = sum(item_price * item_cnt_day)) %>%
  arrange(year) %>%
  ungroup()
year_sales_plot <- ggplot(data = year_sales, aes(x = year, y = sales_per_year, fill = as.factor(year))) +
  geom_bar(stat = "identity") +
  labs(title= "Revenue Sales per Year", x= "Year", y = "Total Sales per Year", fill = "Year")
# Revenue per (year, month). `.groups = "drop"` drops the residual grouping
# that summarise() keeps on `year`, silencing the console message and making
# a separate ungroup() unnecessary.
year_month_sales <- sales %>%
  group_by(year, month) %>%
  summarise(total_sales = sum(item_price * item_cnt_day), .groups = "drop") %>%
  arrange(year)
year_month_sales_plot <- ggplot(data = year_month_sales, aes(x = year, y = total_sales, fill = as.factor(month))) +
  geom_bar(stat = "identity") +
  labs(title= "Year-Month Revenue Sales", x= "Year", y = "Year-Month Revenue Sales", fill = "Month")
# We want to forecast sales for month 11 and 12
# Daily revenue totals, largest first.
highest_sales_day <- sales %>%
  group_by(date) %>%
  summarise(sales_per_day = sum(item_price * item_cnt_day)) %>%
  arrange(desc(sales_per_day)) %>%
  ungroup()
highest_sales_day_plot <- ggplot(data = highest_sales_day, aes(x = date, y = sales_per_day)) +
  geom_point(na.rm = TRUE, color = "blue", size = 0.3) +
  scale_x_date(breaks = date_breaks("7 months"), labels = date_format("%b %y")) +
  labs(title= "Count of Items Sold per Day", x= "Date(s)", y = "Total sales per day")
grid.arrange(year_sales_plot, year_month_sales_plot, nrow=2, ncol=1)

highest_sales_day_plot

# Decomposing the time series
# (classical decomposition into trend + seasonal + remainder components)
df_revenue <- decompose(df1)
plot(df_revenue)

df_n_sales <- decompose(df2)
plot(df_n_sales)

# Testing whether the time series is stationary or not
# Augmented Dickey-Fuller Test (adf test). A p-Value of less than 0.05 in adf.test() indicates that it is stationary.
adf.test(df1)
##
## Augmented Dickey-Fuller Test
##
## data: df1
## Dickey-Fuller = -2.5878, Lag order = 3, p-value = 0.3452
## alternative hypothesis: stationary
# p = 0.3452 > 0.05: cannot reject the unit root, so the revenue series is
# treated as non-stationary.
adf.test(df2)
##
## Augmented Dickey-Fuller Test
##
## data: df2
## Dickey-Fuller = -2.7272, Lag order = 3, p-value = 0.2912
## alternative hypothesis: stationary
# p = 0.2912 > 0.05: the item-count series is likewise non-stationary.
# Load test data: the (shop_id, item_id) pairs to forecast for.
# header = TRUE spelled out (T is reassignable and unsafe).
sales_test <- read.csv("test.csv", header = TRUE)
head(sales_test)
## ID shop_id item_id
## 1 0 5 5037
## 2 1 5 5320
## 3 2 5 5233
## 4 3 5 5232
## 5 4 5 5268
## 6 5 5 5039
# Number of distinct items appearing in the test set.
sales_test %>% select(item_id) %>% distinct() %>% count()
## n
## 1 5100
# Function to retrieve item_category_id based on item_id.
#
# Vectorized replacement for the original scalar implementation: match()
# finds the first row of `items` whose item_id equals each requested id,
# one vectorized pass instead of scanning the whole table per call.
# Behavior notes vs. the original: unmatched ids still yield NA; if an
# item_id appeared more than once in `items`, only the FIRST category is
# returned (the original returned all of them, which broke sapply()).
#
# item_id : item id(s) to look up (scalar or vector).
# items   : data.frame with columns item_id and item_category_id.
# returns : the matching item_category_id(s), NA where not found.
get_item_category_id <- function(item_id, items) {
  items$item_category_id[match(item_id, items$item_id)]
}
# Map each test row's item_id to its item_category_id.
# One vectorized match() lookup (O(n + m)) replaces the original sapply()
# call, which re-scanned the entire items table for every test row.
sales_test$item_category_id <- items$item_category_id[match(sales_test$item_id, items$item_id)]
head(sales_test)
## ID shop_id item_id item_category_id
## 1 0 5 5037 19
## 2 1 5 5320 55
## 3 2 5 5233 19
## 4 3 5 5232 23
## 5 4 5 5268 20
## 6 5 5 5039 23
# Encode the IDs as factors to mirror the training data.
sales_test$shop_id <- factor(sales_test$shop_id)
sales_test$item_category_id <- factor(sales_test$item_category_id)
head(sales_test)
## ID shop_id item_id item_category_id
## 1 0 5 5037 19
## 2 1 5 5320 55
## 3 2 5 5233 19
## 4 3 5 5232 23
## 5 4 5 5268 20
## 6 5 5 5039 23
## Linear Regression Model for Number of Sales####
# Baseline: predict daily units sold from shop and category factors only
# (additive effects; item_id itself is not used, so all items in a category
# at a shop share one prediction).
lr_items_model <- lm(item_cnt_day ~ shop_id + item_category_id, data = sales)
# Make predictions on sales_test data
sales_test_predictions <- predict(lr_items_model, newdata = sales_test)
# Total predicted units summed over all test (shop, item) pairs, in thousands.
total_sales <- sum(sales_test_predictions)
total_sales <- total_sales/1000
total_sales
## [1] 146.4998
## Linear Regression Model for Revenue ####
# Same design, but predicting unit price per row.
lr_revenue_model <- lm(item_price ~ shop_id + item_category_id, data = sales)
# Make predictions on sales_test data
sales_test_predictions_revenue <- predict(lr_revenue_model, newdata = sales_test)
# Total predicted price summed over test rows, in millions.
# NOTE(review): this sums predicted *prices*, not price * quantity — confirm
# that is the intended revenue proxy.
total_revenue <- sum(sales_test_predictions_revenue)
total_revenue <- total_revenue/1000000
total_revenue
## [1] 197.2241
library(lightgbm)
##
## Attaching package: 'lightgbm'
## The following object is masked from 'package:plotly':
##
## slice
## The following object is masked from 'package:dplyr':
##
## slice
# Lightgbm model for revenue prediction
# Define the training data
# NOTE(review): shop_id and item_category_id are factors at this point, and
# as.matrix() on a data.frame of factor columns yields a CHARACTER matrix,
# while lgb.Dataset() expects numeric input. data.matrix() (factor ->
# integer codes) looks like the intended call — verify before relying on
# these results.
train_data_revenue <- lgb.Dataset(data = as.matrix(sales[, c("shop_id", "item_category_id")]),
label = sales$item_price)
# Set LightGBM parameters
params <- list(objective = "regression",
metric = "rmse",
num_leaves = 31,
learning_rate = 0.05,
feature_fraction = 0.9,
bagging_fraction = 0.8,
bagging_freq = 5,
verbose = -1)
# Train the LightGBM model
lgb_model_revenue <- lgb.train(params = params,
data = train_data_revenue,
nrounds = 100)
# Make predictions on the sales_test dataset
# NOTE(review): the same factor-to-character concern applies to this matrix.
revenue_test_predictions_lgb <- predict(lgb_model_revenue, as.matrix(sales_test[, c("shop_id", "item_category_id")]))
# Total predicted revenue over the test pairs, in millions.
total_revenue_lgb <- sum(revenue_test_predictions_lgb)
total_revenue_lgb <- total_revenue_lgb/1000000
total_revenue_lgb
## [1] 191.8044
# Lightgbm model for number of sales prediction
# Define the training data (same factor-coercion caveat as above)
train_data_sales <- lgb.Dataset(data = as.matrix(sales[, c("shop_id", "item_category_id")]),
label = sales$item_cnt_day)
# Set LightGBM parameters (same configuration as the revenue model)
params <- list(objective = "regression",
metric = "rmse",
num_leaves = 31,
learning_rate = 0.05,
feature_fraction = 0.9,
bagging_fraction = 0.8,
bagging_freq = 5,
verbose = -1)
# Train the LightGBM model
lgb_model_sales <- lgb.train(params = params,
data = train_data_sales,
nrounds = 100)
# Make predictions on the sales_test dataset
sales_test_predictions_lgb <- predict(lgb_model_sales, as.matrix(sales_test[, c("shop_id", "item_category_id")]))
# Total predicted units over the test pairs, in thousands.
total_sales_lgb <- sum(sales_test_predictions_lgb)
total_sales_lgb <- total_sales_lgb/1000
total_sales_lgb
## [1] 244.4263
# 1 year Time Series Predictions for Revenue
# Hold out the last 9 observations (Feb-Oct 2015) for testing.
df1_train <- head(df1, length(df1) - 9)
df1_test <- tail(df1, 9)
df1_test
# Rebuild proper ts objects. NOTE: the original code forced end = c(2015, 10)
# on a 25-point train set and end = c(2016, 10) on a 9-point test set, which
# made ts() silently RECYCLE the data to fill the longer range and corrupted
# every later MAPE computation (hence the length-mismatch warnings).
df1_train_ts <- ts(df1_train, start = c(2013, 1), frequency = 12)
df1_test_ts <- ts(df1_test, start = c(2015, 2), frequency = 12)
# Naive benchmark: every forecast equals the last observed value; forecast
# exactly the 9 hold-out months.
df1_naive_mod <- naive(df1_train_ts, h = 9)
summary(df1_naive_mod)
# Metrics::mape(actual, predicted) — the actual values go FIRST; the
# original call had the arguments swapped (which changes the denominator)
# and compared against a hard-coded, truncated forecast value.
df1_res_naive <- mape(as.numeric(df1_test_ts), as.numeric(df1_naive_mod$mean)) * 100
df1_res_naive
plot(df1, main="Forecast for Yearly/Monthly Revenue", xlab="Time", ylab="Millions of Dollars")
lines(df1_naive_mod$mean, col=4) #Naive method prediction

# Simple exponential smoothing benchmark for revenue.
df1_se_model <- ses(df1_train_ts, h = 12)
summary(df1_se_model)
# Evaluate on the 9 held-out months (Feb-Oct 2015).
# NOTE: the original hard-coded the point forecast (68.94), truncated it
# with as.integer(), and passed mape() its arguments in the wrong order
# (Metrics::mape expects the actual values first). Use the model's own
# forecasts, trimmed to the hold-out horizon.
actual_rev_se <- as.numeric(tail(df1, 9))
pred_rev_se <- as.numeric(df1_se_model$mean)[seq_along(actual_rev_se)]
df1_res_se <- mape(actual_rev_se, pred_rev_se) * 100
df1_res_se
autoplot(df1_se_model) +
  autolayer(fitted(df1_se_model), series="Fitted") +
  ylab("Revenue (millions of Dollars)") + xlab("Year")

# Holt's linear trend method for revenue.
# NOTE: the original fit holt() on the FULL series df1 — including the 9
# hold-out months — so its "test" MAPE leaked training data. Fit on the
# training window only (built locally so this section is self-contained).
df1_holt_train <- ts(head(df1, length(df1) - 9), start = c(2013, 1), frequency = 12)
df1_holt_model <- holt(df1_holt_train, h = 12)
df1_holt_model
df1_holt_model$mean
df1_holt <- as.data.frame(df1_holt_model)
# Compare the first 9 forecasts against the 9 held-out months.
# mape(actual, predicted): the original call was swapped AND mismatched in
# length (12 forecasts vs a recycled 46-point "test" series).
actual_rev_holt <- as.numeric(tail(df1, 9))
df1_res_holt <- mape(actual_rev_holt, df1_holt$`Point Forecast`[seq_along(actual_rev_holt)]) * 100
df1_res_holt
# Damped-trend variant for visual comparison (not scored).
df1_holt_model_damp <- holt(df1_holt_train, damped = TRUE, phi = 0.9, h = 15)
df1_holt_model_damp
autoplot(df1) +
  autolayer(df1_holt_model, series="Holt's method", PI=FALSE) +
  autolayer(df1_holt_model_damp, series="Damped Holt's method", PI=FALSE) +
  ggtitle("Revenue forecasts from Holt's method") + xlab("Year") +
  ylab("Revenue in Millions $)") +
  guides(colour=guide_legend(title="Forecast"))

# Model-comparison table (MAPE, %) for the revenue series.
library(DT)
Model_df1 <- c('Naive','Simple Exponential Smoothing','Holt Trend Method')
MAPE_df1 <- c(df1_res_naive,df1_res_se,df1_res_holt)
df1_sales <- data.frame(Model_df1, MAPE_df1)
datatable(df1_sales)
# 1 year Time Series Predictions of number of items sold
# Hold out the last 12 observations (Nov 2014 - Oct 2015) for testing.
df2_train <- head(df2, length(df2) - 12)
df2_test <- tail(df2, 12)
df2_test
# Rebuild proper ts objects. NOTE: the original forced end dates onto ts(),
# silently recycling the data to fill 2013-2016 and mislabeling the test
# window (visible in the oddly wrapped printout above the fix).
df2_train_ts <- ts(df2_train, start = c(2013, 1), frequency = 12)
df2_test_ts <- ts(df2_test, start = c(2014, 11), frequency = 12)
# Naive benchmark over exactly the 12 hold-out months.
df2_naive_mod <- naive(df2_train_ts, h = 12)
summary(df2_naive_mod)
# mape(actual, predicted): the original call was swapped and compared a
# hard-coded 133 against the recycled "test" series.
df2_res_naive <- mape(as.numeric(df2_test_ts), as.numeric(df2_naive_mod$mean)) * 100
df2_res_naive
plot(df2, main="Forecast for Yearly/Monthly Items Sold", xlab="Time", ylab="Thousands of Sales")
lines(df2_naive_mod$mean, col=4) #Naive method prediction

# Simple exponential smoothing benchmark for items sold.
df2_se_model <- ses(df2_train_ts, h = 12)
summary(df2_se_model)
# Evaluate on the 12 held-out months.
# NOTE: the original hard-coded the forecast (120.97), truncated it with
# as.integer(), and passed mape() its arguments in the wrong order
# (Metrics::mape expects the actual values first). Use the model's own
# forecasts instead.
actual_cnt_se <- as.numeric(tail(df2, 12))
pred_cnt_se <- as.numeric(df2_se_model$mean)[seq_along(actual_cnt_se)]
df2_res_se <- mape(actual_cnt_se, pred_cnt_se) * 100
df2_res_se
autoplot(df2_se_model) +
  autolayer(fitted(df2_se_model), series="Fitted") +
  ylab("Thousands of Sales") + xlab("Year")

# Holt's linear trend method for items sold.
# NOTE: the original fit holt() on the FULL series df2 — including the 12
# hold-out months — leaking test data into the fit. Train on the first 22
# months only (built locally so this section is self-contained).
df2_holt_train <- ts(head(df2, length(df2) - 12), start = c(2013, 1), frequency = 12)
df2_holt_model <- holt(df2_holt_train, h = 12)
df2_holt_model
df2_holt_model$mean
df2_holt <- as.data.frame(df2_holt_model)
# mape(actual, predicted): actual hold-out values first; the original call
# was swapped and compared against a recycled 46-point "test" series.
actual_cnt_holt <- as.numeric(tail(df2, 12))
df2_res_holt <- mape(actual_cnt_holt, df2_holt$`Point Forecast`[seq_along(actual_cnt_holt)]) * 100
df2_res_holt
# Damped-trend variant for visual comparison (not scored).
df2_holt_model_damp <- holt(df2_holt_train, damped = TRUE, phi = 0.9, h = 15)
df2_holt_model_damp
autoplot(df2) +
  autolayer(df2_holt_model, series="Holt's method", PI=FALSE) +
  autolayer(df2_holt_model_damp, series="Damped Holt's method", PI=FALSE) +
  ggtitle("Sales forecasts from Holt's method") + xlab("Year") +
  ylab("Thousands of Sales") +
  guides(colour=guide_legend(title="Forecast"))

# Model-comparison table (MAPE, %) for the item-count series.
Model_df2 <- c('Naive','Simple Exponential Smoothing','Holt Trend Method')
MAPE_df2 <- c(df2_res_naive,df2_res_se,df2_res_holt)
df2_sales <- data.frame(Model_df2, MAPE_df2)
datatable(df2_sales)
# Print a short description of the fitted LightGBM sales model
# (tree count, objective, number of feature columns).
summary(lgb_model_sales)
## LightGBM Model (100 trees)
## Objective: regression
## Fitted to dataset with 2 columns